We need to import the intersect data from our data. Here is what our data looks like:

scaffold    qstart    qend    query   type    seqname    seqstart   seqend  attribute
---
1   39310   39334   uce-127282_p11  intron  1   39281   39334
1   39334   39419   uce-127282_p11  exon    1   39335   39463   Parent=transcript:ENSMPTT00005003008;constitutive=0;exon_id=ENSMPTE00005013128;rank=8;version=1
...
1   108104  108225  uce-146693_p5   intergenic  1   72314   146673
...

(how intersect was created can be found here: https://github.com/crcardenas/Adephaga_UCE/blob/main/workflow.md)

Load Library

library(tidyr) # data clean up
library(dplyr) # data cleanup
library(readr) # for importing
library(ggplot2) # for plotting 
library(scales) # additional package for plotting
library(ggtext) # additional package for plotting
#library(psych) # for statistics, if necessary
# library(GenomicFeatures) # my not actually need this since we are doing our own thing

Load data

We have two files because of different GFF attribute fields for genes and exons. These can be joined later for manipulation, but may not need to be. There is no header information so we will need to add the header info I described above

d.intro_exon <- read_tsv(file="../Adephaga2.9-pterMadi2.introns-exons.out.intersect", col_names = F, na = c("","NA"))
## Rows: 14952 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): X1, X4, X5, X6, X9
## dbl (4): X2, X3, X7, X8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d.inter_gene <- read_tsv(file="../Adephaga2.9-pterMadi2.intergenic-genentic.out.intersect", col_names = F, na = c("","NA"))
## Rows: 15956 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): X1, X4, X5, X6, X9
## dbl (4): X2, X3, X7, X8
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(d.intro_exon) <- c("scaffold","qstart","qend","query","type","seqname","seqstart","seqend","attribute")
colnames(d.inter_gene) <- c("scaffold","qstart","qend","query","type","seqname","seqstart","seqend","attribute")
d.intro_exon
d.inter_gene

add new columns for each file

  1. split out query column, one for UCE and one for UCE probe (UCE#_p##)
  1. split out GFF attribute column if present:
df.intro_exon <- d.intro_exon %>% separate_wider_delim(cols = query, 
                           delim = "_", 
                           names = c("uce","probe")) %>% 
  separate_wider_delim(cols = attribute, 
                       delim = ";",
                       names = c("parent","constitutive","ID","rank","version")) %>% 
  separate_wider_delim(cols = parent,
                       delim = ":",
                       names = c("parent","transcript")) %>% 
  separate_wider_delim(cols = ID,
                       delim = "=", 
                       names = c("gff_attribute","exon_id")) %>% 
  mutate(query=paste(uce,probe, sep="_")) %>% 
  select(scaffold, qstart, qend, query, uce, probe, type, seqname, seqstart, seqend, transcript, exon_id)
df.intro_exon
df.inter_gene <- d.inter_gene %>% separate_wider_delim(cols = query, 
                           delim = "_", 
                           names = c("uce","probe")) %>% 
  separate_wider_delim(cols = attribute, 
                       delim = ";",
                       names = c("ID","biotype","geneID","version")) %>% 
  separate_wider_delim(cols = biotype,
                       delim = "=",
                       names = c("gff_attribute1","biotype")) %>% 
  separate_wider_delim(cols = ID,
                       delim = ":",
                       names = c("gff_attribute2","gene_id")) %>% 
  mutate(query=paste(uce,probe, sep="_")) %>% 
  select(scaffold, qstart, qend, query, uce, probe, type, seqname, seqstart, seqend, biotype, gene_id)
df.inter_gene

Summary

UCEs that map to exons

# df.intro_exon %>% distinct(uce, .keep_all = T) %>%  count(type) 
# 
# #uce_by_exon <- df.intro_exon %>% distinct(uce, .keep_all = T) %>%  group_by(exon_id) %>% summarize(uce_count = sum(uce >= 1)) 
# 
# uce_by_exon2 <- uce_by_exon %>% filter(uce_count > 1) %>% na.omit()
# 
# psych::describe(uce_by_exon2)
# df.inter_gene %>% distinct(uce, .keep_all = T) %>%  count(type)
# 
# uce_count_by_gene <- df.inter_gene %>% distinct(uce, .keep_all = T) %>%  group_by(gene_id) %>% summarize(uce_count = sum(uce >= 1))
# 
# uce_count_by_gene2 <- uce_count_by_gene %>% filter(uce_count > 1) %>% na.omit() # na.omit gets rid of the intergenic UCEs (all 769 of them)
# 
# psych::describe(na.omit(uce_count_by_gene$uce_count)) # includes the 769 loci coming from the intergenic regions
# psych::describe(na.omit(uce_count_by_gene2$uce_count))

Genetic & intergenic

we need to make one more category that best characterizes as genetic, intergenic, or both

  scaffold  qstart    qend query         uce        probe type       seqname seqstart  seqend biotype        gene_id           
  <chr>      <dbl>   <dbl> <chr>         <chr>      <chr> <chr>      <chr>      <dbl>   <dbl> <chr>          <chr>             
1 1        1011217 1011282 uce-71245_p7  uce-71245  p7    gene       1         982581 1011637 protein_coding ENSMPTG00005003053
2 1        1011217 1011329 uce-71245_p8  uce-71245  p8    gene       1         982581 1011637 protein_coding ENSMPTG00005003053
3 1        1018428 1018429 uce-267689_p7 uce-267689 p7    intergenic 1        1011637 1018429 NA             NA                
4 1        1018429 1018545 uce-267689_p7 uce-267689 p7    gene       1        1018430 1035525 protein_coding ENSMPTG00005001153

see second UCE here, it is both intergenic and genetic… these are the only variables we are worried about right now.

  1. first group by UCEs
  2. then create a new column called category with mutate
  • if the type column has more than one distinct type (e.g., both intergenic and genetic) give the column “intergenic_genetic”
  1. ungroup the UCEs
  2. now that we have this new category, we can use distinct to get the counts of the UCE features
category.inter_gene <- df.inter_gene %>%  
  group_by(uce) %>% 
  mutate(category = if (n_distinct(type) > 1) 'intergenic_genetic' else unique(type)) %>%
  ungroup() %>% 
  select(scaffold, uce, probe, type, category)
category.inter_gene.all <- category.inter_gene %>% distinct(uce, .keep_all = T) %>% select(category) %>% table() %>% as_tibble()
category.inter_gene.all
category.inter_gene.bychromosome <- category.inter_gene %>% group_by(scaffold) %>% distinct(uce, .keep_all = T) %>% select(category) %>% table()
## Adding missing grouping variables: `scaffold`
category.inter_gene.bychromosome
##         category
## scaffold gene intergenic intergenic_genetic
##       1   106         38                 10
##       10  115         36                  7
##       11  114         47                  5
##       12  154         40                  8
##       13   60         12                  4
##       14   57         11                  5
##       15   15          2                  2
##       16   28         11                  3
##       17   54         19                  5
##       18    9          2                  2
##       2   176         93                 15
##       3   139         47                 10
##       4    85         35                  4
##       5   117         62                 13
##       6   121         32                  9
##       7   154         57                  6
##       8    98         20                  7
##       9   127        112                 14
##       X   136         37                  8
#create our dataframe
category.df.inter_gene.all <- data.frame(
  genomic_feature= category.inter_gene.all$.,
  uce_count = category.inter_gene.all$n) %>% 
  mutate( proportion = round(uce_count / sum(uce_count), 4))
category.df.inter_gene.all
# create a pie chart
pb.category.inter_gene.all <- category.df.inter_gene.all %>% 
  ggplot(aes(x="", y=proportion, fill=reorder(genomic_feature,proportion))) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0) +
  scale_fill_grey() +
  theme(axis.title.y = element_blank(),
         axis.title.x = element_blank(),
         panel.border = element_blank(),
         panel.grid = element_blank(),
         axis.ticks = element_blank(),
         axis.text.x = element_blank(),
         panel.background = element_blank(),
         plot.title = element_text(hjust=0.5)) +
  geom_text(aes(label = percent(proportion, accuracy = 0.01), fontface=2),
            position = position_stack(vjust=0.5)) +
  ggtitle("UCE characterized as genetic or intergenic") +
  labs(fill="Genomic feature") +
  guides(fill=guide_legend(reverse=T))
pb.category.inter_gene.all

UCEs that map to gene features

# df.intro_exon %>% distinct(uce, .keep_all = T) %>%  count(type) 
# 
# #uce_by_exon <- df.intro_exon %>% distinct(uce, .keep_all = T) %>%  group_by(exon_id) %>% summarize(uce_count = sum(uce >= 1)) 
# 
# uce_by_exon2 <- uce_by_exon %>% filter(uce_count > 1) %>% na.omit()
# 
# psych::describe(uce_by_exon2)
category.intro_exon <- df.intro_exon %>%
  group_by(uce) %>%
  mutate(category = if (n_distinct(type) > 1) 'intron_exon' else unique(type)) %>%
  ungroup() %>%
  select(scaffold, uce, probe, type, category)
category.intro_exon.all <- category.intro_exon %>% distinct(uce, .keep_all = T) %>% select(category) %>% table() %>% as_tibble()
category.intro_exon.all
category.intro_exon.bychromosome <- category.intro_exon %>% group_by(scaffold) %>% distinct(uce, .keep_all = T) %>% select(category) %>% table()
## Adding missing grouping variables: `scaffold`
category.intro_exon.bychromosome
##         category
## scaffold exon intron intron_exon
##       1    66      5          45
##       10   69      4          47
##       11   64      5          52
##       12  106      7          50
##       13   36      2          26
##       14   34      2          27
##       15    9      0           8
##       16   22      1           8
##       17   34      4          21
##       18    9      0           2
##       2   113     14          66
##       3    86      7          58
##       4    46      4          40
##       5    78      9          46
##       6    68      6          56
##       7   110      4          47
##       8    58      3          46
##       9    73     13          57
##       X    60     18          67
#create our dataframe
category.df.intro_exon.all <- data.frame(
  genomic_feature= category.intro_exon.all$.,
  uce_count = category.intro_exon.all$n) %>%
  mutate( proportion = round(uce_count / sum(uce_count), 4)) %>% 
  arrange(desc(proportion))
category.df.intro_exon.all
# create a pie chart
pb.category.intro_exon.all <- category.df.intro_exon.all %>% 
  ggplot(aes(x="", y=proportion, fill=reorder(genomic_feature,uce_count))) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0) +
  scale_fill_grey() +
  theme(axis.title.y = element_blank(),
         axis.title.x = element_blank(),
         panel.border = element_blank(),
         panel.grid = element_blank(),
         axis.ticks = element_blank(),
         axis.text.x = element_blank(),
         panel.background = element_blank(),
         plot.title = element_text(hjust=0.5)) +
  geom_text(aes(label = percent(proportion, accuracy = 0.01), fontface=2),
            position = position_stack(vjust=0.5)) +
  ggtitle("Genetic UCEs characterized as intron or exon")  +
  labs(fill="Gene feature") +
  guides(fill=guide_legend(reverse=T))
pb.category.intro_exon.all

multi-UCE genes

next we need to characterize the number of UCEs per gene. Here is an example from the df.inter_gene with three different UCEs mapping to the same gene

scaffold   qstart     qend query          uce        probe type  seqname seqstart   seqend biotype        gene_id          
X        35120719 35120839 uce-123899_p11 uce-123899 p11   gene  X       35120216 35135278 protein_coding ENSMPTG00005012093
X        35120759 35120879 uce-123899_p12 uce-123899 p12   gene  X       35120216 35135278 protein_coding ENSMPTG00005012093
X        35129993 35130087 uce-17802_p12  uce-17802  p12   gene  X       35120216 35135278 protein_coding ENSMPTG00005012093
X        35130007 35130127 uce-17802_p11  uce-17802  p11   gene  X       35120216 35135278 protein_coding ENSMPTG00005012093
X        35131551 35131671 uce-123823_p12 uce-123823 p12   gene  X       35120216 35135278 protein_coding ENSMPTG00005012093
X        35131591 35131711 uce-123823_p11 uce-123823 p11   gene  X       35120216 35135278 protein_coding ENSMPTG00005012093
category.gene_uce <- df.inter_gene %>%
  group_by(gene_id) %>%
  mutate(gene_id = 
           if (type == 'intergenic') 'intergenic' else gene_id) %>% # ignoring warning for now
  mutate(uce_count = 
           case_when(
             n_distinct(uce) == 1 ~ 'n=1',
             n_distinct(uce) == 2 ~ 'n=2',
             n_distinct(uce) == 3 ~ 'n=3',
             n_distinct(uce) == 4 ~ 'n=4',
             n_distinct(uce) == 5 ~ 'n=5')) %>%
  ungroup() %>%
  select(scaffold, uce, probe, type, gene_id, uce_count)
category.gene_uce
category.gene_uce.all <- category.gene_uce %>% distinct(gene_id, .keep_all = T) %>% select(uce_count) %>% table() %>% as_tibble()
category.gene_uce.all
category.gene_uce.bychromosome <- category.gene_uce %>% group_by(scaffold) %>% distinct(gene_id, .keep_all = T) %>% select(uce_count) %>% table()
## Adding missing grouping variables: `scaffold`
category.gene_uce.bychromosome
##         uce_count
## scaffold n=1 n=2 n=3 n=4 n=5
##       1   84  10   4   0   0
##       10  96  10   3   0   0
##       11  85   9   4   0   1
##       12 111  26   1   0   0
##       13  59   4   0   0   0
##       14  48   5   2   0   0
##       15  16   2   0   0   0
##       16  27   2   0   0   0
##       17  44   5   2   0   0
##       18   9   1   0   0   0
##       2  125  26   5   1   0
##       3  110  17   2   1   0
##       4   55  12   4   0   0
##       5   80  18   3   1   1
##       6   89  12   5   1   0
##       7  109  15   5   2   0
##       8   70  17   1   0   1
##       9  101  17   3   0   0
##       X  102  11   4   2   0

Next we create a data frame and plot our results, only plot N > 1 UCEs per gene

# create our data frame from our object class table
category.df.gene_uce.all <- data.frame(
  uce_per_gene = category.gene_uce.all$.,
  genes_total = category.gene_uce.all$n) %>% 
  filter(uce_per_gene!='n=1')
category.df.gene_uce.all
# create barplot
bp.gene_uce.all <- category.df.gene_uce.all %>%  
  ggplot(aes(x=uce_per_gene,y=genes_total)) +
  geom_bar(stat="identity") +
  annotate("text",x=3.5, y=150,
           label=
"genetic UCEs: 1863\nintergenic UCEs: 710\nintergenic & genetic UCEs: 131 *\n
genes with >=1 UCEs: 1698\nexonic UCEs: 1141\nintronic & exonic UCEs 766 *\nintronic UCEs: 108") +
  geom_text( aes(label=genes_total), vjust=-1) +
  scale_y_continuous(limits=c(0,225)) +
  labs(x="UCE per gene",
       y="total genes",
       title= expression("Adephaga 2.9k UCEs within *Pterostichus madidus* genes")) +
  theme_bw() +
  theme(plot.title = ggtext::element_markdown())
bp.gene_uce.all

Something is off in the counting, things arent adding up. I’ve identified 1698 genes, with one or more UCE (e.g., 219 with 2, 48 with 3, etc.) BUT the cumulative of all identfied exonic, intronic & exonic, and intronic UCEs is greater.

The total number of mapped UCEs is 2704, which tracks with the genetic/intergenic calculations. However, something is being counted wrong (or my interpretation of the counting is way off). 1698 genes with 1 or more UCEs mapped to them…. but when I count specific gene features (gff, intronict, or both) the sum is much greater. For example, 219 genes have two (438 UCEs), 48 genes have three (144), eight genes have four (32), and three have five (15); for a total of 629 multi-genic UCEs. There are 1420 genes with one UCE, plus the other genes = 1698 genes. cool cool cool. there are 629 + 1420 = 2049 genic UCEs. But this summatoin does not equal the 1863 genetic UCEs identified previously calculated (even including 131 both intergenic and genetic uces: 1994)

these calculations are all so off some how.

# uces_in_genes <- uce_count_by_gene2 %>% ggplot(aes(x=uce_count)) +
#   geom_bar(stat="count") +
#     annotate("text",
#            x=4.5, y=150,
#            label="intergenic UCEs: 770\ngenes with UCEs: 1934\n>1 UCE per gene: 257\nUCEs in exons: 1420\nUCEs in introns: 595") +
#   geom_text(stat= 'count',
#             aes(label=..count..),
#             vjust=-1) +
#   scale_y_continuous(limits=c(0,225)) +
# #  theme_update(plot.title = element_text(hjust = 0.5)) +
#   labs(x="UCE per gene",
#        y="total genes",
#        title= expression("Adephaga 2.9k UCEs present in *Pterostichus madidus* genome")) +
#   theme_bw() +
#   theme(plot.title = ggtext::element_markdown())
# uces_in_genes

Trying to solve the counting issue. genes w/ UCE


sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggtext_0.1.2  scales_1.1.1  ggplot2_3.3.5 readr_2.1.3   dplyr_1.0.10 
## [6] tidyr_1.3.0  
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.0 xfun_0.36        bslib_0.4.2      purrr_1.0.1     
##  [5] colorspace_2.0-2 vctrs_0.5.2      generics_0.1.3   htmltools_0.5.4 
##  [9] yaml_2.3.7       utf8_1.2.2       rlang_1.0.6      gridtext_0.1.5  
## [13] jquerylib_0.1.4  pillar_1.8.1     glue_1.6.2       withr_2.5.0     
## [17] DBI_1.1.3        bit64_4.0.5      lifecycle_1.0.3  stringr_1.5.0   
## [21] munsell_0.5.0    gtable_0.3.0     evaluate_0.15    labeling_0.4.2  
## [25] knitr_1.37       tzdb_0.3.0       fastmap_1.1.0    parallel_4.1.2  
## [29] markdown_1.1     fansi_1.0.4      highr_0.9        Rcpp_1.0.10     
## [33] cachem_1.0.6     vroom_1.6.1      jsonlite_1.8.4   farver_2.1.0    
## [37] bit_4.0.5        hms_1.1.2        digest_0.6.31    stringi_1.7.12  
## [41] grid_4.1.2       cli_3.6.0        tools_4.1.2      magrittr_2.0.3  
## [45] sass_0.4.0       tibble_3.1.8     crayon_1.5.2     pkgconfig_2.0.3 
## [49] ellipsis_0.3.2   xml2_1.3.3       assertthat_0.2.1 rmarkdown_2.20  
## [53] rstudioapi_0.13  R6_2.5.1         compiler_4.1.2